Loading the required packages
library(tidyverse)
library(dplyr)
library(ggplot2)
library(rtweet)
library(readr)
library(DataExplorer)
Import processed data, which can be found here.
# Load the preprocessed wine reviews from the processed-data folder.
wines <- read.csv("../data/processed_data/wines.csv")
Get sample of dataset
# Fix the RNG seed so the sample is reproducible. The value is the birthday
# of Ricardo Rodriguez (American wrestler and ring announcer) and of
# Dr. Reinaldo (Rei) Sanchez-Arias.
set.seed(19630217)
# Share of the full dataset, in percent, kept for quick experiments.
percentage <- 5
wine_sample <- wines %>%
  sample_n(percentage / 100 * nrow(wines))
# Lookup table with one row per distinct (taster_name, taster_twitter_handle)
# pair found in the reviews.
tasters <- unique(select(wines, taster_name, taster_twitter_handle))
tasters
Drop taster_twitter_handle in wines dataframe
# Remove the Twitter handle column from the reviews table.
wines <- select(wines, -taster_twitter_handle)
head(wines)
Each reviewer has their own bias. To offset that, we made a “profile” for each reviewer, which includes characteristics like avg_points, sd_points, and var_points.
# Per-taster scoring profile: mean, standard deviation and variance of the
# points each taster awarded, plus the number of reviews they wrote.
taster_rating_profile <- summarise(
  group_by(wines, taster_name),
  avg_points = mean(points),
  sd_points = sd(points),
  var_points = var(points),
  reviews = n()
)
# Attach the profile to the taster lookup; inner_join() keeps only tasters
# that appear in both tables.
tasters <- tasters %>%
  inner_join(taster_rating_profile, by = "taster_name")
head(tasters)
Add following classification to wine dataset as found on the website:
| Category | Rating | Description |
|---|---|---|
| Classic | 98-100 | The pinnacle of quality. |
| Superb | 94-97 | A great achievement. |
| Excellent | 90-93 | Highly recommended. |
| Very Good | 87-89 | Often good value; well recommended. |
| Good | 83-86 | Suitable for everyday consumption; often good value. |
| Acceptable | 80-82 | Can be employed in casual, less-critical circumstances |
# Classify point scores into named rating categories.
#
# Lower bounds (inclusive): 98 "Classic", 94 "Superb", 90 "Excellent",
# 87 "Very Good", 83 "Good"; anything below 83 is "Acceptable".
#
# points: numeric vector of scores (a single value works as before).
# Returns a character vector the same length as `points`; NA scores give NA
# instead of raising an error.
rating_category <- function(points) {
  lower_bounds <- c(83, 87, 90, 94, 98)
  categories <- c(
    "Acceptable", "Good", "Very Good", "Excellent", "Superb", "Classic"
  )
  # findInterval() counts how many lower bounds each score has reached,
  # so adding 1 indexes the matching label.
  categories[findInterval(points, lower_bounds) + 1L]
}
# Label every wine with its rating category. rowwise() makes mutate()
# evaluate rating_category() once per row.
wines <- mutate(rowwise(wines), rating_category = rating_category(points))
head(wines)
Since each reviewer has a different bias, we created a normalized metric, norm_points, by looking at the number of standard deviations a wine is from the reviewer’s avg_points. This gives us a more accurate representation of which wines are better than the rest.
# Add `norm_points`: how many of the taster's own standard deviations a
# wine's score lies from that taster's average score.
#
# data: review table with `taster_name` and `points` columns.
# Reads the global `tasters` table for avg_points / sd_points.
# Returns `data` (row-wise) with `norm_points` added and the joined helper
# columns removed again.
normalize_points <- function(data) {
  joined <- left_join(data, tasters, by = "taster_name")
  scored <- mutate(
    rowwise(joined),
    norm_points = (points - avg_points) / sd_points
  )
  select(
    scored,
    -c(avg_points, sd_points, var_points, taster_twitter_handle, reviews)
  )
}
# Replace the reviews table with the version carrying norm_points.
wines <- wines %>%
  normalize_points()
head(wines)
Vintage seems to have year 7200
# Keep only wines with a vintage before 2019. filter() also drops rows where
# the comparison is NA.
wines <- filter(wines, vintage < 2019)
Correlating price by points, using DataExplorer library which can be found here
# Histogram of alcohol percentage across all wines (50 bins). The x axis is
# limited to 4-22 with a tick at every whole percent.
# No group_by() here: ggplot2 does not use dplyr grouping, so it had no
# effect on the plot.
wines %>%
  ggplot(mapping = aes(x = alcohol)) +
  geom_histogram(na.rm = TRUE, bins = 50) +
  scale_x_continuous(
    name = "Alcohol Percentage",
    breaks = seq(0, 25, 1),
    limits = c(4, 22)
  )
# Price against points, one scatter panel per wine category.
ggplot(wines) +
  geom_point(aes(x = points, y = price)) +
  facet_wrap(vars(category))
Count wines per year (Note: Data has been sanitized)
# Number of wines recorded for each vintage year.
summarise(group_by(wines, vintage), count = n())
Grouping rowwise data frame strips rowwise nature
# Bar chart of how many wines fall in each vintage year.
ggplot(wines) +
  geom_bar(aes(x = vintage))
To better understand the number of wines per winery, we did a univariate visualization that counts the number of wines per winery, showing only the top 15 wineries to give you an idea of which winery has the largest selection of wines.
# The 15 wineries with the most wines, drawn as horizontal bars ordered by
# their wine count.
wines %>%
  group_by(winery) %>%
  summarise(count = n()) %>%
  arrange(-count) %>%
  head(15) %>%
  ggplot(aes(x = count, y = reorder(winery, count))) +
  geom_col()
Grouping rowwise data frame strips rowwise nature
To better understand the number of wines per province, we did a univariate visualization that counts the number of wines per province, showing only the top 10 provinces with the most wines. This can give the reader an idea of where their wine will most likely be made, with California standing out as a clear leader.
# The 10 provinces with the most wines, drawn as horizontal bars ordered by
# their wine count.
wines %>%
  group_by(province) %>%
  summarise(count = n()) %>%
  arrange(-count) %>%
  head(10) %>%
  ggplot(aes(x = count, y = reorder(province, count))) +
  geom_col()
Grouping rowwise data frame strips rowwise nature
Calculating the Mean, Standard Deviation, Minimum, and Max Price for the entire wine dataset and printing the values.
# Summary statistics of price over the whole dataset; missing prices are
# ignored.
mean_price <- mean(wines[["price"]], na.rm = TRUE)
sd_price <- sd(wines[["price"]], na.rm = TRUE)
min_price <- min(wines[["price"]], na.rm = TRUE)
max_price <- max(wines[["price"]], na.rm = TRUE)
print(paste("Mean Price:", mean_price, sep = " "))
[1] "Mean Price: 35.4748507788616"
print(paste("SD Price:", sd_price, sep = " "))
[1] "SD Price: 41.238007633635"
print(paste("Min Price:", min_price, sep = " "))
[1] "Min Price: 4"
print(paste("Max Price:", max_price, sep = " "))
[1] "Max Price: 3300"
Calculating the Mean, Standard Deviation, Minimum, and Max Points for the entire wine dataset and printing the values.
# Summary statistics of points over the whole dataset; missing scores are
# ignored.
mean_points <- mean(wines[["points"]], na.rm = TRUE)
sd_points <- sd(wines[["points"]], na.rm = TRUE)
min_points <- min(wines[["points"]], na.rm = TRUE)
max_points <- max(wines[["points"]], na.rm = TRUE)
print(paste("Mean Points:", mean_points, sep = " "))
[1] "Mean Points: 88.4744820916541"
print(paste("SD Points:", sd_points, sep = " "))
[1] "SD Points: 3.05417480898736"
print(paste("Min Points:", min_points, sep = " "))
[1] "Min Points: 80"
print(paste("Max Points:", max_points, sep = " "))
[1] "Max Points: 100"
To help you understand the point distribution by reviewer, we did a multivariate visualization that compares each taster's points with the average wine points, marked by the vertical line (x-intercept). This gives you, the reader, an idea of how each reviewer compares to the overall average.
# Box plot of awarded points per taster; the vertical line marks the mean of
# all points in the dataset.
ggplot(wines) +
  geom_boxplot(aes(x = points, y = taster_name)) +
  geom_vline(xintercept = mean(wines$points))
Notice the data is “stacked” and the scores range from 80-100
# Scatter plot of price against points. Low alpha shows where points pile up;
# rows with missing values are dropped silently.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
wines %>%
  ggplot() +
  geom_point(
    mapping = aes(x = points, y = price),
    na.rm = TRUE,
    alpha = 0.15
  ) +
  labs(title = "Price by Points", x = "Points", y = "Price")
TODO: IZZY (Why did we log this?)
# Same scatter plot with price on a natural-log scale (log() defaults to
# base e); rows with missing values are dropped silently.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
wines %>%
  ggplot() +
  geom_point(
    mapping = aes(x = points, y = log(price)),
    na.rm = TRUE,
    alpha = 0.15
  ) +
  labs(title = "log(Price) by Points", x = "Points", y = "log(Price)")
To help you understand the data analysis, we found the best province for wine by using the average points across the wines.
# Overall mean score: the bar a province's average must beat.
mean_points <- mean(wines$points, na.rm = TRUE)
# Average the points per province before filtering. The summary must keep a
# `points` column; otherwise filter() resolves `points` to the base graphics
# function and the comparison fails.
best_province <- wines %>%
  group_by(province) %>%
  summarise(points = mean(points, na.rm = TRUE)) %>%
  filter(points > mean_points) %>%
  arrange(desc(points)) %>%
  ggplot() +
  geom_col(mapping = aes(x = province, y = points))
Best wine, by variety